************************************************
***Title: pollen_clean.do
***Creators: Joelle Abramowitz, Shooshan Danagoulian, and Owen Fleming*
***Notes: This file cleans the pollen data from NAB and constructs a county-date panel. To do this, it applies pollen measurements from a given station to the county of the measurement station and the neighboring counties.

*For questions, contact
*Owen Fleming
*hg3490@wayne.edu
************************************************


**********SETUP
cd ${path}/data/pollen
clear all


**********IMPORT
*Import each year of data from separate sheet of NAB excel spreadsheet
forvalues i=2006/2018{
import excel using "Wayne State - NAB Data (1)", clear firstrow sheet(`i')
rename *, lower
save pollen_`i', replace	
}

use pollen_2006, clear


**********APPEND
forvalues i=2007/2018{
	append using pollen_`i'

}
rename *, lower


**********VARIABLE CREATION
*Time variables
gen month=month(date)
gen year=year(date)
gen day=day(date)
drop if missing(date)

*Pollen variables
egen totalpollen_tree=rowtotal(acer alnus arecaceae betula  carya celtis corylus cupressaceae fagus fraxinus juglans ligustrum liquidambar morus myrica olea othertreepollen pinaceae platanus populus prosopis pseudotsuga quercus salix tilia tsuga typha ulmus)

egen totalpollen_grassweed=rowtotal(ambrosia artemisia asteraceaeexcludingambrosiaa carpinusostrya chenopodiaceaeamaranthaceae cyperaceae eupatorium gramineaepoaceae othergrasspollen otherweedpollen plantago rumex urticaceae)

egen totalsporecount=rowtotal(agrocybetype alternaria ascosporesundifferentiated basidiosporesundifferentiated botrytis cercospora chaetomium cladosporium coprinustype curvularia diatrypaceae dreshslerahelminthosporium epicoccum fusarium ganoderma leptosphaeriatype nigrospora oidiumerysiphe otherfungi penicilliumaspergillus periconia peronospora pithomyces pleospora polythrincium rusts smutsmyxomycetes stemphylium torula unidentifiedfungi)

egen totalpollen=rowtotal(totalpollen_tree totalpollen_grassweed)

*Geographies
gen county_name=""
gen county=.
gen city =.
gen state=.

replace county_name="Buncombe" if city_name=="Asheville"
replace county=21  if county_name=="Buncombe"
replace city=. if city_name=="Asheville"
replace state=37  if city_name=="Asheville"

replace county_name="Baltimore" if city_name=="Baltimore"
replace county=510  if county_name=="Baltimore"
replace city=50 if city_name=="Baltimore"
replace state =24  if city_name=="Baltimore"

replace county_name="Montgomery" if city_name=="Dayton"
replace county=113  if county_name=="Montgomery"
replace city=. if city_name=="Dayton"
replace state=39  if city_name=="Dayton"

replace county_name="Williamson" if city_name=="Georgetown"
replace county=491  if county_name=="Williamson"
replace city=. if city_name=="Georgetown"
replace state =48  if city_name=="Georgetown"

replace county_name="Greenville" if city_name=="Greenville"
replace county=45  if county_name=="Greenville"
replace city=1040 if city_name=="Greenville"
replace state =45  if city_name=="Greenville"

replace county_name="Harris" if city_name=="Houston (Station 2)"
replace county=201 if county_name=="Harris"
replace city =3280 if city_name=="Houston (Station 2)"
replace state=48 if city_name=="Houston (Station 2)"

replace county_name="Harris" if city_name=="Houston (Station 1)"
replace county=201 if county_name=="Harris"
replace city =3280 if city_name=="Houston (Station 1)"
replace state=48 if city_name=="Houston (Station 1)"

replace county_name="La Crosse" if city_name=="La Crosse"
replace county=63 if county_name=="La Crosse"
replace city =. if city_name=="La Crosse"
replace state=55 if city_name=="La Crosse"

replace county_name="Ontario" if city_name=="London"
replace county=. if county_name=="Ontario"
replace city =. if city_name=="London"
replace state=. if city_name=="London"

replace county_name="Cobb" if city_name=="Marietta"
replace county=67 if county_name=="Cobb"
replace city =. if city_name=="Marietta"
replace state=13 if city_name=="Marietta"

replace county_name="Hennepin" if city_name=="Minneapolis"
replace county=53 if county_name=="Hennepin"
replace city =. if city_name=="Minneapolis"
replace state=27 if city_name=="Minneapolis"

replace county_name="Cattaraugus" if city_name=="Olean"
replace county=9 if county_name=="Cattaraugus"
replace city =4460 if city_name=="Olean"
replace state=36 if city_name=="Olean"

replace county_name="Sarpy" if city_name=="Papillion"
replace county=153 if county_name=="Sarpy"
replace city =. if city_name=="Papillion"
replace state=31 if city_name=="Papillion"

replace county_name="Multnomah" if city_name=="Portland"
replace county=51 if county_name=="Multnomah"
replace city =. if city_name=="Portland"
replace state=41 if city_name=="Portland"

replace county_name="King" if city_name=="Seattle"
replace county=33 if county_name=="King"
replace city =1960 if city_name=="Seattle"
replace state=53 if city_name=="Seattle"

replace county_name="Montgomery" if city_name=="Silver Spring"
replace county=31 if county_name=="Montgomery"
replace city =36 if city_name=="Silver Spring"
replace state=24 if city_name=="Silver Spring"

replace city_name="Waco (Station 2)" if city_name=="Waco"&loc_name=="Allergy and Asthma Center"
replace county_name="McLennan" if city_name=="Waco (Station 2)"
replace county=309 if county_name=="McLennan"
replace city =. if city_name=="Waco (Station 2)"
replace state=48 if city_name=="Waco (Station 2)"

replace city_name="Waco (Station 1)" if city_name=="Waco"&loc_name=="Allergy & Asthma Care of Waco"
replace county_name="McLennan" if city_name=="Waco (Station 1)"
replace county=309 if county_name=="McLennan"
replace city =. if city_name=="Waco (Station 1)"
replace state=48 if city_name=="Waco (Station 1)"

replace county_name="Contra Costa" if city_name=="Walnut Creek"
replace county=13 if county_name=="Contra Costa"
replace city =. if city_name=="Walnut Creek"
replace state=6 if city_name=="Walnut Creek"

duplicates drop

drop if county_name==""&year==2018 
* Note that houston may have more than one location of measurement, potentially both coming from different branches of the health department.  we drop one of them.


**********REFORMATTING AND SAVE
keep year month day date totalsporecount totalpollen_tree totalpollen_grassweed totalpollen loc_name city_name state_abb zip country county_name county city state

rename totalpollen_tree treepollen
rename totalpollen_grassweed grassweedpollen
rename totalpollen totalpollencount
rename state_abb state_abbrev
rename city_name city_n
rename county_name county_n
gen city_name=strlower(city_n)
gen county_name=strlower(county_n)
drop city_n county_n
format date %td

duplicates drop city_name date, force


**********MERGE COUNTY ADJACENCY FILE
*Format adjacency file into wide in preparation for merging
preserve
use county_adjacency2010, clear

gen fipsneighbor_double=real(fipsneighbor)
format %10.0g fipsneighbor_double
drop fipsneighbor 
rename fipsneighbor_double fipsneighbor

gen fipscounty_double=real(fipscounty)
format %10.0g fipscounty_double
drop fipscounty 
rename fipscounty_double fipscounty
by fipscounty, sort: gen id=_n

reshape wide neighborname fipsneighbor, i(fipscounty) j(id)

save county_adjacency2010_wide, replace
restore

*Attach the adjacency file
drop if country=="CA"
gen fips=state*1000 + county
rename fips fipscounty
rename treepollen treepollencount
rename grassweedpollen grassweedpollencount
by fipscounty date, sort: egen totalpollen=max(totalpollencount)
by fipscounty date, sort: egen treepollen=max(treepollencount)
by fipscounty date, sort: egen grassweedpollen=max(grassweedpollencount)
by fipscounty date, sort: egen totalspore=max(totalsporecount)
collapse (mean) totalpollen treepollen grassweedpollen totalspore (firstnm) month year day city_name state_abbrev  county_name county city state country, by(fipscounty date)

merge m:m fipscounty using county_adjacency2010_wide

drop if _merge==2

drop _merge

forvalues i=1/15 {
	gen totalpollen`i'=totalpollen
	gen treepollen`i'=treepollen
	gen grassweedpollen`i'=grassweedpollen
	gen totalspore`i'=totalspore
}

drop totalpollen treepollen grassweedpollen totalspore

egen fips_date=group(fipscounty date)

reshape long totalpollen treepollen grassweedpollen totalspore neighborname fipsneighbor, i(fips_date) j(id)

drop if missing(fipsneighbor)

duplicates drop fipsneighbor date, force

*winsorize high values of pollen of the 100 or 200 highest values to the next highest value.  This corrects for possible measurement error.
winsor totalpollen, h(200) gen(totalpollen_h200)
winsor treepollen, h(100) gen(treepollen_h100)
winsor grassweedpollen, h(100) gen(grassweedpollen_h100)
winsor totalspore, h(100) gen(totalspore_h100)


**********FINAL CLEANING
*drop variables
drop fips_date id city state city_name county county_name country

*basic renaming
rename fipscounty county_measurement
rename countyname countyname_measurement
rename fipsneighbor county 
rename neighborname countyname

*generate state variable
gen state = floor(county/1000)

*get proper statename
merge m:1 state using fips_to_statename
keep if _merge==3
drop _merge

order county countyname state statename date county_measurement countyname_measurement totalpollen treepollen grassweedpollen totalspore totalpollen_h200 treepollen_h100 grassweedpollen_h100 totalspore_h100 month year day 

sort county date


**********SAVE
save pollen, replace


**********REVERT DIRECTORY
cd ${path}
